Introduction

Practicing “turning knobs” to get a better sense of how the different parameters work. I am simulated data set 17.

Simulated data

Data were simulated across a square grid of 100 X 100 cells with depth contours.

sim.dat = readRDS("coursework/simulations/sim_data/sim_dat_17.RDS")
head(sim.dat)
##   year X Y      eta  observed eta_scaled observed_scaled depth_scaled
## 1    1 1 1 17.04969 26.029819  0.7428023       0.7252767    -1.639687
## 2    1 2 1 17.37308 30.805359  0.7568914       0.8583390    -1.572175
## 3    1 3 1 17.69647 22.991022  0.7709806       0.6406058    -1.504663
## 4    1 4 1 18.01986  9.307899  0.7850697       0.2593488    -1.437152
## 5    1 5 1 18.34325 24.644069  0.7991588       0.6866651    -1.369640
## 6    1 6 1 18.66665 24.320600  0.8132480       0.6776522    -1.302129
grid.dat = readRDS("coursework/simulations/sim_data/grid.RDS")
grid.dat$negDepth = -1 * grid.dat$depth
head(grid.dat)
##   X Y depth year depth_scaled negDepth
## 1 1 1     1    1    -1.639687       -1
## 2 2 1     2    1    -1.572175       -2
## 3 3 1     3    1    -1.504663       -3
## 4 4 1     4    1    -1.437152       -4
## 5 5 1     5    1    -1.369640       -5
## 6 6 1     6    1    -1.302129       -6

There are eight (8) columns in a simulated dataset: - year: the year, being 1:6; - X: the location along the x-axis; - Y: the location along the y-axis; - eta: the simulated values; - observed: the observed value of the simulated data (i.e. observation error applied to eta); - eta_scaled: the simulated data scaled by the mean of simulated data in a reference simulated dataset; - observed_scaled: the observed data scaled by the mean of observations in a reference simulated dataset; - depth_scaled: the depth scaled by subtracting the mean and dividing by the standard deviation.

#plot grid
ggplot(subset(grid.dat, year == 1), aes(x = X, y = Y)) +
  geom_raster(aes(fill = negDepth)) +
  scale_fill_viridis() + 
  scale_x_continuous(expand = c(0, 0)) + # Remove expansion on x-axis
  scale_y_continuous(expand = c(0, 0)) # Remove expansion on x-axis

#plot eta
ggplot(subset(sim.dat, year == 1), aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  scale_fill_viridis() +  
  scale_x_continuous(expand = c(0, 0)) + # Remove expansion on x-axis
  scale_y_continuous(expand = c(0, 0)) # Remove expansion on x-axis

#plot observed
ggplot(subset(sim.dat, year == 1), aes(x = X, y = Y)) +
  geom_raster(aes(fill = observed_scaled)) +
  scale_fill_viridis() + 
  scale_x_continuous(expand = c(0, 0)) + # Remove expansion on x-axis
  scale_y_continuous(expand = c(0, 0)) # Remove expansion on x-axis

The grid shows deeper depths in the middle along x-axis. The simulated data (eta) shows variation throughout X and Y space. The simulated data (observed) shows grainier data and less smoothing across the patches.

Sampling from the simulated data

Using simple random sampling (SRS). Testing process for just one year. Implementing dplyr version provided in second example because this is usually how I like to code.

source("coursework/simulations/functions_sampling.R")
set.seed(245)
sample.dat.yr = sim.dat %>% filter(year == 1) %>% slice_sample(n = 100) # taking 100 random samples

# create sample (as if you were a boat sampling once in each sampled grid cell)
obsCV = 0.2  # if using "observed" set this to zero (observation error already applied)
catchabilityPars = list(mean=1,var=0.1) #related to gear selectivity (and availability to net)
sample = sampleGrid.fn(sample.dat.yr, obsCV, catchabilityPars, varName="eta_scaled")

#plot sample
ggplot(sample, aes(x = X, y = Y)) +
  geom_raster(aes(fill = observation)) +
  scale_fill_viridis() +  
  scale_x_continuous(expand = c(0, 0)) + # Remove expansion on x-axis
  scale_y_continuous(expand = c(0, 0)) # Remove expansion on x-axis
## Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted
## ℹ Consider using `geom_tile()` instead.
## Raster pixels are placed at uneven horizontal intervals and will be shifted
## ℹ Consider using `geom_tile()` instead.

Using SRS across all years (6 in total).

# sample from each year of survey data
sample.dat.all.yr = sim.dat %>% group_by(year) %>% slice_sample(n = 100)

samples = sampleGrid.fn(as.data.frame(sample.dat.all.yr), obsCV, catchabilityPars, varName="eta_scaled")

# plot samples over surface of mean without observation error (taken as "true")
ggplot(sim.dat, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

Analysis - Design-based

Assuming that the area of each grid cell is 1, under simple random sampling the design-based estimate is the product of the mean of observations and the total area (1 * the number of grid cells in the domain). We then calculate the variance, standard error and 95% confidence interval.

N = 100*100
n = 100
index_db = samples %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(n/N)) * (var(observation)/n)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = n - 1) * se,
         upr = index + qt(0.975, df = n - 1) * se)

# plot index with 95% CI
ggplot(index_db, aes(x = year, y = index)) + 
  geom_line() + 
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  labs(y = "Total biomass estimate", 
       x = "Year")

We see that the biomass estimate increases drastically in year 4 and remains higher afterwards.

Analysis - Model-based

Fit model and check convergence and parameter values

# create mesh to size of samples
mesh = make_mesh(samples, xy_cols = c("X", "Y"), type = "cutoff_search", n_knots = 50)
## cutoff = 1.00 | knots = 1029 | ↓ 
## cutoff = 100.00 | knots = 24 | ↑ 
## cutoff = 10.00 | knots = 91 | ↓ 
## cutoff = 31.62 | knots = 34 | ↑ 
## cutoff = 17.78 | knots = 46 | ↑ 
## cutoff = 13.34 | knots = 61 | ↓ 
## cutoff = 15.40 | knots = 53 | ↓ 
## cutoff = 16.55 | knots = 49 | ↑ 
## cutoff = 15.96 | knots = 50 | ✔
plot(mesh)

model.fit = sdmTMB(
  formula = observation ~ 0 + as.factor(year), 
  data = samples,
  mesh = mesh,
  time = "year",
  family = lognormal(), 
  spatial = "on", # c("on", "off")
  spatiotemporal = "iid", # c("iid", "ar1", "rw", "off")
)

model.fit
## Spatiotemporal model fit by ML ['sdmTMB']
## Formula: observation ~ 0 + as.factor(year)
## Mesh: mesh (isotropic covariance)
## Time column: character
## Data: samples
## Family: lognormal(link = 'log')
##  
## Conditional model:
##                  coef.est coef.se
## as.factor(year)1    -0.09    0.09
## as.factor(year)2    -0.12    0.09
## as.factor(year)3    -0.05    0.09
## as.factor(year)4     0.27    0.09
## as.factor(year)5     0.16    0.09
## as.factor(year)6     0.13    0.09
## 
## Dispersion parameter: 0.38
## Matérn range: 32.07
## Spatial SD: 0.16
## Spatiotemporal IID SD: 0.15
## ML criterion at convergence: 322.435
## 
## See ?tidy.sdmTMB to extract these values as a data frame.
sanity(model.fit) # model checking
## ✔ Non-linear minimizer suggests successful convergence
## ✔ Hessian matrix is positive definite
## ✔ No extreme or very small eigenvalues detected
## ✔ No gradients with respect to fixed effects are >= 0.001
## ✔ No fixed-effect standard errors are NA
## ✔ No standard errors look unreasonably large
## ✔ No sigma parameters are < 0.01
## ✔ No sigma parameters are > 100
## ✔ Range parameter doesn't look unreasonably large

Inspect model for goodness of fit

# randomized quantile residuals
samples$resids = residuals(model.fit) 
hist(samples$resids) # looks ok but a bit left tailed

# qq plot
qqnorm(samples$resids)

# qqline not working?

# spatial residuals
ggplot(samples, aes(x = X, y = Y, color = resids)) + 
  scale_color_gradient2() +
  geom_point() + 
  coord_fixed() + 
  facet_wrap(~year) 

# hard for the untrained eye to assess if this looks good or not (I think it looks good?)

Predict from model to all locations and years

Predict to grid of full survey domain and map predictions and each component of the predictions (fixed and random effects) to see contributions of each to the full prediction.

# predict
predictions = predict(model.fit, newdata = grid.dat, return_tmb_object = TRUE)

# visualize predictions, then how fixed and random effects contribute
plot_map = function(sim.dat, column) { ggplot(sim.dat, aes(x = X, y = Y, fill = {{ column }})) + 
    geom_raster() + facet_wrap(~year) + coord_fixed() }

# full prediction
plot_map(predictions$data, exp(est)) + 
  scale_fill_viridis_c(trans = "sqrt") + 
  ggtitle("Prediction (fixed effects + all random effects)")

# fixed effects only (year)
plot_map(predictions$data, exp(est_non_rf)) + 
  ggtitle("Prediction (fixed effects only)") + 
  scale_fill_viridis_c(trans = "sqrt")

# spatial random effects
plot_map(predictions$data, omega_s) + 
  ggtitle("Spatial random effects only") + 
  scale_fill_gradient2()

# spatiotemporal random effects
plot_map(predictions$data, epsilon_st) + 
  ggtitle("Spatiotemporal random effects only") + 
  scale_fill_gradient2()

Compute abundance index

# we will assume that the area of each grid cell is 1
index_mb = get_index(predictions, area = 1, bias_correct = TRUE, level = 0.95) 
# desired confidence interval here is 95%

# plot index with 95% CI
ggplot(index_mb, aes(x = year, y = est)) + 
  geom_line() + 
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  labs(y = "Total biomass estimate", 
       x = "Year")

Similar pattern to the design-based but let’s compare them together.

Compare the two model ests.

head(index_db)
## # A tibble: 6 × 6
##    year  index variance    se    lwr    upr
##   <int>  <dbl>    <dbl> <dbl>  <dbl>  <dbl>
## 1     1  9858.  153010.  391.  9082. 10634.
## 2     2  9318.  156145.  395.  8534. 10102.
## 3     3 10185.  223676.  473.  9247. 11123.
## 4     4 13707.  276770.  526. 12663. 14751.
## 5     5 12229.  241179.  491. 11254. 13203.
## 6     6 12613.  365280.  604. 11414. 13812.
head(index_mb)
##   year       est       lwr      upr  log_est         se se_natural  type
## 1    1  9735.740  9010.906 10518.88 9.183559 0.03947420   9672.060 index
## 2    2  9399.384  8697.939 10157.40 9.148399 0.03957118   9336.304 index
## 3    3 10255.759  9485.777 11088.24 9.235595 0.03982010  10188.013 index
## 4    4 13954.490 12902.057 15092.77 9.543557 0.04000816  13861.578 index
## 5    5 12495.444 11505.547 13570.51 9.433119 0.04211038  12408.924 index
## 6    6 12397.966 11482.309 13386.64 9.425288 0.03914608  12313.304 index
index_db.b = index_db %>% select(year, index, lwr, upr) %>%
  mutate(type = "Design")
index_mb.b = index_mb %>% select(year, est, lwr, upr) %>%
  rename(index = est) %>%
  mutate(type = "Model")
index_real = sim.dat %>% group_by(year) %>% 
  summarize(index.eta = sum(eta_scaled))
index.all = rbind(index_db.b, index_mb.b)

# plot both together
ggplot() +
  geom_line(data = index.all, aes(x = year, y = index, color = type, fill = type)) +
  geom_ribbon(data = index.all, aes(x = year, y = index, color = type, fill = type, ymin = lwr, ymax = upr), 
              alpha = 0.3) +
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year")
## Warning in geom_line(data = index.all, aes(x = year, y = index, color = type, :
## Ignoring unknown aesthetics: fill

# looks pretty similar but see more differences from year 4-6

Exercise 1: Change number of sampling units.

Set up sampling units

Sampling 100 units in the 100x100 is the equivalent of sampling 1% of the entire grid. Let’s try 5% and 10%.

(100*100)*0.05
## [1] 500
# 500
(100*100)*0.1
## [1] 1000
# 1000

# sample from each year of survey data
set.seed(245)
sample.dat.1p = sim.dat %>% group_by(year) %>% slice_sample(n = 100)
sample.dat.5p = sim.dat %>% group_by(year) %>% slice_sample(n = 500)
sample.dat.10p = sim.dat %>% group_by(year) %>% slice_sample(n = 1000)

samples.1p = sampleGrid.fn(as.data.frame(sample.dat.1p), obsCV, catchabilityPars, varName="eta_scaled")
samples.5p = sampleGrid.fn(as.data.frame(sample.dat.5p), obsCV, catchabilityPars, varName="eta_scaled")
samples.10p = sampleGrid.fn(as.data.frame(sample.dat.10p), obsCV, catchabilityPars, varName="eta_scaled")

# plot samples over surface of mean without observation error (taken as "true")
# 1%
ggplot(sim.dat, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples.1p, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

# 5%
ggplot(sim.dat, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples.5p, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

# 10%
ggplot(sim.dat, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples.10p, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

Wow looks like I might be confused? The 10% bubbles almost fill up the entire grid…but maybe that is just the plotting.

Analysis - Design-based

N = 100*100
n.1 = 100
n.5 = 500
n.10 = 1000

index_db.1p = samples.1p %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(n.1/N)) * (var(observation)/n.1)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = n.1 - 1) * se,
         upr = index + qt(0.975, df = n.1 - 1) * se,
         sample = "1perc")

index_db.5p = samples.5p %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(n.5/N)) * (var(observation)/n.5)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = n.5 - 1) * se,
         upr = index + qt(0.975, df = n.5 - 1) * se,
         sample = "5perc")

index_db.10p = samples.10p %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(n.10/N)) * (var(observation)/n.10)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = n.10 - 1) * se,
         upr = index + qt(0.975, df = n.10 - 1) * se,
         sample = "10perc")

index_db.allp = rbind(index_db.1p, index_db.5p, index_db.10p) %>%
  mutate(sample = factor(sample, levels = c("1perc", "5perc", "10perc")))

# plot index with 95% CI
ggplot() + 
  geom_line(data = index_db.allp, aes(x = year, y = index, color = sample, fill = sample), size = 2) + 
  geom_ribbon(data = index_db.allp, aes(x = year, y = index, color = sample, fill = sample, ymin = lwr, ymax = upr),
              alpha = 0.25) + 
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in geom_line(data = index_db.allp, aes(x = year, y = index, color =
## sample, : Ignoring unknown aesthetics: fill

1% and 10% follow similar trend, but the 5% misses the slight dip in Y5. 1% though misses decline in Y6. Unsurprisingly, the higher the sample size the smaller the confidence intervals around the estimate. What is interesting though is that in the 1% situation, the larger confidence intervals end up overlapping with the true data, but the 5 and 10% ones don’t. So in a way, the larger sample size decrease in CI shows greater “precision” but it is not?

Analysis - Model-based

# create mesh to size of samples
mesh.1p = make_mesh(samples.1p, xy_cols = c("X", "Y"), type = "cutoff_search", n_knots = 50)
## cutoff = 1.00 | knots = 1060 | ↓ 
## cutoff = 100.00 | knots = 24 | ↑ 
## cutoff = 10.00 | knots = 91 | ↓ 
## cutoff = 31.62 | knots = 32 | ↑ 
## cutoff = 17.78 | knots = 47 | ↑ 
## cutoff = 13.34 | knots = 66 | ↓ 
## cutoff = 15.40 | knots = 56 | ↓ 
## cutoff = 16.55 | knots = 53 | ↓ 
## cutoff = 17.15 | knots = 52 | ↓ 
## cutoff = 17.47 | knots = 48 | ↑ 
## cutoff = 17.31 | knots = 51 | ↓ 
## cutoff = 17.39 | knots = 51 | ↓ 
## cutoff = 17.43 | knots = 51 | ↓ 
## cutoff = 17.45 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 48 | ↑ 
## cutoff = 17.46 | knots = 48 | ↑ 
## cutoff = 17.46 | knots = 48
plot(mesh.1p)

mesh.5p = make_mesh(samples.5p, xy_cols = c("X", "Y"), type = "cutoff_search", n_knots = 75)
## cutoff = 1.00 | knots = 3499 | ↓ 
## cutoff = 100.00 | knots = 24 | ↑ 
## cutoff = 10.00 | knots = 101 | ↓ 
## cutoff = 31.62 | knots = 37 | ↑ 
## cutoff = 17.78 | knots = 52 | ↑ 
## cutoff = 13.34 | knots = 67 | ↑ 
## cutoff = 11.55 | knots = 79 | ↓ 
## cutoff = 12.41 | knots = 72 | ↑ 
## cutoff = 11.97 | knots = 78 | ↓ 
## cutoff = 12.19 | knots = 73 | ↑ 
## cutoff = 12.08 | knots = 77 | ↓ 
## cutoff = 12.13 | knots = 75 | ✔
plot(mesh.5p)

mesh.10p = make_mesh(samples.10p, xy_cols = c("X", "Y"), type = "cutoff_search", n_knots = 100)
## cutoff = 1.00 | knots = 5431 | ↓ 
## cutoff = 100.00 | knots = 24 | ↑ 
## cutoff = 10.00 | knots = 101 | ↓ 
## cutoff = 31.62 | knots = 34 | ↑ 
## cutoff = 17.78 | knots = 46 | ↑ 
## cutoff = 13.34 | knots = 64 | ↑ 
## cutoff = 11.55 | knots = 85 | ↑ 
## cutoff = 10.75 | knots = 92 | ↑ 
## cutoff = 10.37 | knots = 96 | ↑ 
## cutoff = 10.18 | knots = 97 | ↑ 
## cutoff = 10.09 | knots = 97 | ↑ 
## cutoff = 10.04 | knots = 100 | ✔
plot(mesh.10p)

model.fit.1p = sdmTMB(
  formula = observation ~ 0 + as.factor(year), 
  data = samples.1p,
  mesh = mesh.1p,
  time = "year",
  family = lognormal(), 
  spatial = "on", # c("on", "off")
  spatiotemporal = "iid", # c("iid", "ar1", "rw", "off")
)
model.fit.1p
## Spatiotemporal model fit by ML ['sdmTMB']
## Formula: observation ~ 0 + as.factor(year)
## Mesh: mesh.1p (isotropic covariance)
## Time column: character
## Data: samples.1p
## Family: lognormal(link = 'log')
##  
## Conditional model:
##                  coef.est coef.se
## as.factor(year)1     0.00    0.09
## as.factor(year)2    -0.02    0.09
## as.factor(year)3     0.08    0.09
## as.factor(year)4     0.16    0.09
## as.factor(year)5     0.19    0.09
## as.factor(year)6     0.14    0.09
## 
## Dispersion parameter: 0.37
## Matérn range: 32.05
## Spatial SD: 0.10
## Spatiotemporal IID SD: 0.20
## ML criterion at convergence: 337.549
## 
## See ?tidy.sdmTMB to extract these values as a data frame.
sanity(model.fit.1p) # model checking: good
## ✔ Non-linear minimizer suggests successful convergence
## ✔ Hessian matrix is positive definite
## ✔ No extreme or very small eigenvalues detected
## ✔ No gradients with respect to fixed effects are >= 0.001
## ✔ No fixed-effect standard errors are NA
## ✔ No standard errors look unreasonably large
## ✔ No sigma parameters are < 0.01
## ✔ No sigma parameters are > 100
## ✔ Range parameter doesn't look unreasonably large
model.fit.5p = sdmTMB(
  formula = observation ~ 0 + as.factor(year), 
  data = samples.5p,
  mesh = mesh.5p,
  time = "year",
  family = lognormal(), 
  spatial = "on", # c("on", "off")
  spatiotemporal = "iid", # c("iid", "ar1", "rw", "off")
)
model.fit.5p
## Spatiotemporal model fit by ML ['sdmTMB']
## Formula: observation ~ 0 + as.factor(year)
## Mesh: mesh.5p (isotropic covariance)
## Time column: character
## Data: samples.5p
## Family: lognormal(link = 'log')
##  
## Conditional model:
##                  coef.est coef.se
## as.factor(year)1    -0.07    0.11
## as.factor(year)2    -0.11    0.11
## as.factor(year)3     0.11    0.11
## as.factor(year)4     0.23    0.11
## as.factor(year)5     0.19    0.11
## as.factor(year)6     0.09    0.11
## 
## Dispersion parameter: 0.38
## Matérn range: 42.22
## Spatial SD: 0.11
## Spatiotemporal IID SD: 0.21
## ML criterion at convergence: 1676.926
## 
## See ?tidy.sdmTMB to extract these values as a data frame.
sanity(model.fit.5p) # model checking: good
## ✔ Non-linear minimizer suggests successful convergence
## ✔ Hessian matrix is positive definite
## ✔ No extreme or very small eigenvalues detected
## ✔ No gradients with respect to fixed effects are >= 0.001
## ✔ No fixed-effect standard errors are NA
## ✔ No standard errors look unreasonably large
## ✔ No sigma parameters are < 0.01
## ✔ No sigma parameters are > 100
## ✔ Range parameter doesn't look unreasonably large
model.fit.10p = sdmTMB(
  formula = observation ~ 0 + as.factor(year), 
  data = samples.10p,
  mesh = mesh.10p,
  time = "year",
  family = lognormal(), 
  spatial = "on", # c("on", "off")
  spatiotemporal = "iid", # c("iid", "ar1", "rw", "off")
)
model.fit.10p
## Spatiotemporal model fit by ML ['sdmTMB']
## Formula: observation ~ 0 + as.factor(year)
## Mesh: mesh.10p (isotropic covariance)
## Time column: character
## Data: samples.10p
## Family: lognormal(link = 'log')
##  
## Conditional model:
##                  coef.est coef.se
## as.factor(year)1    -0.05    0.11
## as.factor(year)2    -0.07    0.11
## as.factor(year)3     0.08    0.11
## as.factor(year)4     0.21    0.11
## as.factor(year)5     0.17    0.11
## as.factor(year)6     0.14    0.11
## 
## Dispersion parameter: 0.38
## Matérn range: 46.79
## Spatial SD: 0.10
## Spatiotemporal IID SD: 0.21
## ML criterion at convergence: 3209.667
## 
## See ?tidy.sdmTMB to extract these values as a data frame.
sanity(model.fit.10p) # model checking: good
## ✔ Non-linear minimizer suggests successful convergence
## ✔ Hessian matrix is positive definite
## ✔ No extreme or very small eigenvalues detected
## ✔ No gradients with respect to fixed effects are >= 0.001
## ✔ No fixed-effect standard errors are NA
## ✔ No standard errors look unreasonably large
## ✔ No sigma parameters are < 0.01
## ✔ No sigma parameters are > 100
## ✔ Range parameter doesn't look unreasonably large
# randomized quantile residuals
samples.1p$resids = residuals(model.fit.1p) 
hist(samples.1p$resids) # looks ok but a bit left tailed

samples.5p$resids = residuals(model.fit.5p) 
hist(samples.5p$resids) # looks ok but stronger tails?

samples.10p$resids = residuals(model.fit.10p) 
hist(samples.10p$resids) # ooo strong left tailed

# qq plot
qqnorm(samples.1p$resids)

qqnorm(samples.5p$resids)

qqnorm(samples.10p$resids) # oh ya can see left tail issue

# spatial residuals
ggplot(samples.1p, aes(x = X, y = Y, color = resids)) + 
  scale_color_gradient2() +
  geom_point() + 
  coord_fixed() + 
  facet_wrap(~year)

ggplot(samples.5p, aes(x = X, y = Y, color = resids)) + 
  scale_color_gradient2() +
  geom_point() + 
  coord_fixed() + 
  facet_wrap(~year) 

ggplot(samples.10p, aes(x = X, y = Y, color = resids)) + 
  scale_color_gradient2() +
  geom_point() + 
  coord_fixed() + 
  facet_wrap(~year) 

# visualize predictions, then how fixed and random effects contribute
plot_map = function(sim.dat, column) { ggplot(sim.dat, aes(x = X, y = Y, fill = {{ column }})) + 
    geom_raster() + facet_wrap(~year) + coord_fixed() }

# predict
predictions.1p = predict(model.fit.1p, newdata = grid.dat, return_tmb_object = TRUE)
predictions.5p = predict(model.fit.5p, newdata = grid.dat, return_tmb_object = TRUE)
predictions.10p = predict(model.fit.10p, newdata = grid.dat, return_tmb_object = TRUE)

# full prediciton
plot_map(predictions.1p$data, exp(est)) + 
  scale_fill_viridis_c(trans = "sqrt") + 
  ggtitle("Prediction (fixed effects + all random effects)")

plot_map(predictions.5p$data, exp(est)) + 
  scale_fill_viridis_c(trans = "sqrt") + 
  ggtitle("Prediction (fixed effects + all random effects)")

plot_map(predictions.10p$data, exp(est)) + 
  scale_fill_viridis_c(trans = "sqrt") + 
  ggtitle("Prediction (fixed effects + all random effects)")

# fixed effects
plot_map(predictions.1p$data, exp(est_non_rf)) + 
  ggtitle("Prediction (fixed effects only)") + 
  scale_fill_viridis_c(trans = "sqrt")

plot_map(predictions.5p$data, exp(est_non_rf)) + 
  ggtitle("Prediction (fixed effects only)") + 
  scale_fill_viridis_c(trans = "sqrt")

plot_map(predictions.10p$data, exp(est_non_rf)) + 
  ggtitle("Prediction (fixed effects only)") + 
  scale_fill_viridis_c(trans = "sqrt")

# spatial random only
plot_map(predictions.1p$data, omega_s) + 
  ggtitle("Spatial random effects only") + 
  scale_fill_gradient2()

plot_map(predictions.5p$data, omega_s) + 
  ggtitle("Spatial random effects only") + 
  scale_fill_gradient2()

plot_map(predictions.10p$data, omega_s) + 
  ggtitle("Spatial random effects only") + 
  scale_fill_gradient2()

# spatiaotemporal random only
plot_map(predictions.1p$data, epsilon_st) + 
  ggtitle("Spatiotemporal random effects only") + 
  scale_fill_gradient2()

plot_map(predictions.5p$data, epsilon_st) + 
  ggtitle("Spatiotemporal random effects only") + 
  scale_fill_gradient2()

plot_map(predictions.10p$data, epsilon_st) + 
  ggtitle("Spatiotemporal random effects only") + 
  scale_fill_gradient2()

# we will assume that the area of each grid cell is 1
index_mb.1p = get_index(predictions.1p, area = 1, bias_correct = TRUE, level = 0.95) %>% 
  mutate(sample = "1perc")
index_mb.5p = get_index(predictions.5p, area = 1, bias_correct = TRUE, level = 0.95) %>%
  mutate(sample = "5perc")
index_mb.10p = get_index(predictions.10p, area = 1, bias_correct = TRUE, level = 0.95) %>%
  mutate(sample = "10perc")
index_mb.allp = rbind(index_mb.1p, index_mb.5p, index_mb.10p) %>%
  mutate(sample = factor(sample, levels = c("1perc", "5perc", "10perc")))
# desired confidence interval here is 95%

# plot index with 95% CI
ggplot() + 
  geom_line(data = index_mb.allp, aes(x = year, y = est, color = sample, fill = sample), size = 2) + 
  geom_ribbon(data = index_mb.allp, aes(x = year, y = est, color = sample, fill = sample, ymin = lwr, ymax = upr),
              alpha = 0.25) + 
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year")
## Warning in geom_line(data = index_mb.allp, aes(x = year, y = est, color =
## sample, : Ignoring unknown aesthetics: fill

Compare the two model ests.

head(index_db.allp)
## # A tibble: 6 × 7
##    year  index variance    se    lwr    upr sample
##   <int>  <dbl>    <dbl> <dbl>  <dbl>  <dbl> <fct> 
## 1     1 10280.  161777.  402.  9481. 11078. 1perc 
## 2     2 10219.  191047.  437.  9352. 11087. 1perc 
## 3     3 11269.  253750.  504. 10270. 12269. 1perc 
## 4     4 12580.  258042.  508. 11572. 13588. 1perc 
## 5     5 12419.  189240.  435. 11555. 13282. 1perc 
## 6     6 12062.  213773.  462. 11145. 12980. 1perc
head(index_mb.allp)
##   year      est       lwr      upr  log_est         se se_natural  type sample
## 1    1 10405.67  9615.994 11260.20 9.250106 0.04026771   10326.33 index  1perc
## 2    2 10325.35  9535.587 11180.53 9.242357 0.04059839   10240.77 index  1perc
## 3    3 11394.80 10557.159 12298.89 9.340912 0.03895610   11307.67 index  1perc
## 4    4 12774.62 11812.771 13814.79 9.455216 0.03993921   12673.60 index  1perc
## 5    5 12670.17 11731.789 13683.61 9.447006 0.03926002   12571.57 index  1perc
## 6    6 11960.17 11068.958 12923.14 9.389337 0.03950969   11867.65 index  1perc
index_db.allp.b = index_db.allp %>% select(year, sample, index, lwr, upr) %>%
  mutate(type = "Design")
index_mb.allp.b = index_mb.allp %>% select(year, sample, est, lwr, upr) %>%
  rename(index = est) %>%
  mutate(type = "Model")
index.allp = rbind(index_db.allp.b, index_mb.allp.b)
head(index.allp)
## # A tibble: 6 × 6
##    year sample  index    lwr    upr type  
##   <int> <fct>   <dbl>  <dbl>  <dbl> <chr> 
## 1     1 1perc  10280.  9481. 11078. Design
## 2     2 1perc  10219.  9352. 11087. Design
## 3     3 1perc  11269. 10270. 12269. Design
## 4     4 1perc  12580. 11572. 13588. Design
## 5     5 1perc  12419. 11555. 13282. Design
## 6     6 1perc  12062. 11145. 12980. Design
# plot both together
ggplot() +
  geom_line(data = index.allp, aes(x = year, y = index, color = type, fill = type)) +
  geom_ribbon(data = index.allp, aes(x = year, y = index, color = type, fill = type, ymin = lwr, ymax = upr), 
              alpha = 0.3) +
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year") +
  facet_grid(.~sample)
## Warning in geom_line(data = index.allp, aes(x = year, y = index, color = type,
## : Ignoring unknown aesthetics: fill

Ok main takeaway is that sampling more in the grid does not necessarily improve your estimate of abundance. Idk why. How does oversampling work? Is it the same idea as “overfitting” a model or did I not deal with the increase in sample size well?

Exercise 2: Cannot sample shallow water (negDepth > -10)

Set up sampling units

More negative scaled depth = shallower

grid.dat %>% filter(negDepth >= -10) %>% group_by(year) %>% summarize(mds = mean(max(depth_scaled)))
## # A tibble: 6 × 2
##    year   mds
##   <int> <dbl>
## 1     1 -1.03
## 2     2 -1.04
## 3     3 -1.04
## 4     4 -1.06
## 5     5 -1.07
## 6     6 -1.06
# -1.06
# check 
ggplot(subset(grid.dat, depth_scaled >= -1.06), aes(x = X, y = Y)) + 
  geom_raster(aes(fill = negDepth)) +
  scale_fill_viridis() +
  scale_x_continuous(expand = c(0, 0)) + # Remove expansion on x-axis
  scale_y_continuous(expand = c(0, 0)) # Remove expansion on x-axis

# sample from each year of survey data
set.seed(245)
sample.dat = sim.dat %>% group_by(year) %>% slice_sample(n = 100)
sample.dat.no.s = sim.dat %>% filter(depth_scaled >= -1.06) %>% 
  group_by(year) %>% slice_sample(n = 100)

samples = sampleGrid.fn(as.data.frame(sample.dat), obsCV, catchabilityPars, varName="eta_scaled")
samples.no.s = sampleGrid.fn(as.data.frame(sample.dat.no.s), obsCV, catchabilityPars, varName="eta_scaled")

# plot samples over surface of mean without observation error (taken as "true")
# all
ggplot(sim.dat, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

# no shallow
ggplot(sim.dat, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples.no.s, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

Can see no sampling in shallow regions (far left and right side for this grid).

Analysis - Design-based

N = 100*100
n = 100

index_db = samples %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(n/N)) * (var(observation)/n)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = n - 1) * se,
         upr = index + qt(0.975, df = n - 1) * se,
         sample = "all")

index_db.no.s = samples.no.s %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(n/N)) * (var(observation)/n)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = n - 1) * se,
         upr = index + qt(0.975, df = n - 1) * se,
         sample = "no shallow")

index_db.alld = rbind(index_db, index_db.no.s)

# plot index with 95% CI
ggplot() + 
  geom_line(data = index_db.alld, aes(x = year, y = index, color = sample, fill = sample), size = 2) + 
  geom_ribbon(data = index_db.alld, aes(x = year, y = index, color = sample, fill = sample, ymin = lwr, ymax = upr),
              alpha = 0.25) + 
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year")
## Warning in geom_line(data = index_db.alld, aes(x = year, y = index, color =
## sample, : Ignoring unknown aesthetics: fill

It is better in some parts (towards Y4-6), so perhaps that is a spatial thing occurring …have more issues with no shallow in the earlier years.

Analysis - Model-based

# create mesh to size of samples
mesh = make_mesh(samples, xy_cols = c("X", "Y"), type = "cutoff_search", n_knots = 50)
## cutoff = 1.00 | knots = 1060 | ↓ 
## cutoff = 100.00 | knots = 24 | ↑ 
## cutoff = 10.00 | knots = 91 | ↓ 
## cutoff = 31.62 | knots = 32 | ↑ 
## cutoff = 17.78 | knots = 47 | ↑ 
## cutoff = 13.34 | knots = 66 | ↓ 
## cutoff = 15.40 | knots = 56 | ↓ 
## cutoff = 16.55 | knots = 53 | ↓ 
## cutoff = 17.15 | knots = 52 | ↓ 
## cutoff = 17.47 | knots = 48 | ↑ 
## cutoff = 17.31 | knots = 51 | ↓ 
## cutoff = 17.39 | knots = 51 | ↓ 
## cutoff = 17.43 | knots = 51 | ↓ 
## cutoff = 17.45 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 51 | ↓ 
## cutoff = 17.46 | knots = 48 | ↑ 
## cutoff = 17.46 | knots = 48 | ↑ 
## cutoff = 17.46 | knots = 48
plot(mesh)

mesh.no.s = make_mesh(samples.no.s, xy_cols = c("X", "Y"), type = "cutoff_search", n_knots = 75)
## cutoff = 1.00 | knots = 1022 | ↓ 
## cutoff = 100.00 | knots = 31 | ↑ 
## cutoff = 10.00 | knots = 78 | ↓ 
## cutoff = 31.62 | knots = 34 | ↑ 
## cutoff = 17.78 | knots = 43 | ↑ 
## cutoff = 13.34 | knots = 58 | ↑ 
## cutoff = 11.55 | knots = 68 | ↑ 
## cutoff = 10.75 | knots = 72 | ↑ 
## cutoff = 10.37 | knots = 73 | ↑ 
## cutoff = 10.18 | knots = 72 | ↑ 
## cutoff = 10.09 | knots = 72 | ↑ 
## cutoff = 10.04 | knots = 74 | ↑ 
## cutoff = 10.02 | knots = 74 | ↑ 
## cutoff = 10.01 | knots = 74 | ↑ 
## cutoff = 10.01 | knots = 74 | ↑ 
## cutoff = 10.00 | knots = 74 | ↑ 
## cutoff = 10.00 | knots = 74 | ↑ 
## cutoff = 10.00 | knots = 74 | ↑ 
## cutoff = 10.00 | knots = 74 | ↑ 
## cutoff = 10.00 | knots = 74
plot(mesh.no.s)

model.fit = sdmTMB(
  formula = observation ~ 0 + as.factor(year), 
  data = samples,
  mesh = mesh,
  time = "year",
  family = lognormal(), 
  spatial = "on", # c("on", "off")
  spatiotemporal = "iid", # c("iid", "ar1", "rw", "off")
)
model.fit
## Spatiotemporal model fit by ML ['sdmTMB']
## Formula: observation ~ 0 + as.factor(year)
## Mesh: mesh (isotropic covariance)
## Time column: character
## Data: samples
## Family: lognormal(link = 'log')
##  
## Conditional model:
##                  coef.est coef.se
## as.factor(year)1    -0.06    0.08
## as.factor(year)2    -0.09    0.08
## as.factor(year)3     0.12    0.08
## as.factor(year)4     0.26    0.08
## as.factor(year)5     0.21    0.08
## as.factor(year)6     0.17    0.08
## 
## Dispersion parameter: 0.37
## Matérn range: 23.56
## Spatial SD: 0.13
## Spatiotemporal IID SD: 0.22
## ML criterion at convergence: 332.958
## 
## See ?tidy.sdmTMB to extract these values as a data frame.
sanity(model.fit) # model checking: good
## ✔ Non-linear minimizer suggests successful convergence
## ✔ Hessian matrix is positive definite
## ✔ No extreme or very small eigenvalues detected
## ✔ No gradients with respect to fixed effects are >= 0.001
## ✔ No fixed-effect standard errors are NA
## ✔ No standard errors look unreasonably large
## ✔ No sigma parameters are < 0.01
## ✔ No sigma parameters are > 100
## ✔ Range parameter doesn't look unreasonably large
model.fit.no.s = sdmTMB(
  formula = observation ~ 0 + as.factor(year), 
  data = samples.no.s,
  mesh = mesh.no.s,
  time = "year",
  family = lognormal(), 
  spatial = "on", # c("on", "off")
  spatiotemporal = "iid", # c("iid", "ar1", "rw", "off")
)
model.fit.no.s
## Spatiotemporal model fit by ML ['sdmTMB']
## Formula: observation ~ 0 + as.factor(year)
## Mesh: mesh.no.s (isotropic covariance)
## Time column: character
## Data: samples.no.s
## Family: lognormal(link = 'log')
##  
## Conditional model:
##                  coef.est coef.se
## as.factor(year)1     0.08    0.09
## as.factor(year)2     0.00    0.09
## as.factor(year)3     0.14    0.09
## as.factor(year)4     0.26    0.09
## as.factor(year)5     0.21    0.09
## as.factor(year)6     0.08    0.09
## 
## Dispersion parameter: 0.39
## Matérn range: 39.41
## Spatial SD: 0.13
## Spatiotemporal IID SD: 0.13
## ML criterion at convergence: 358.812
## 
## See ?tidy.sdmTMB to extract these values as a data frame.
sanity(model.fit.no.s) # model checking: good
## ✔ Non-linear minimizer suggests successful convergence
## ✔ Hessian matrix is positive definite
## ✔ No extreme or very small eigenvalues detected
## ✔ No gradients with respect to fixed effects are >= 0.001
## ✔ No fixed-effect standard errors are NA
## ✔ No standard errors look unreasonably large
## ✔ No sigma parameters are < 0.01
## ✔ No sigma parameters are > 100
## ✔ Range parameter doesn't look unreasonably large
# randomized quantile residuals
samples$resids = residuals(model.fit) 
hist(samples$resids) # looks ok but a bit left tailed

samples.no.s$resids = residuals(model.fit.no.s) 
hist(samples.no.s$resids) # looks actually a little better

# qq plot
qqnorm(samples$resids)

qqnorm(samples.no.s$resids)

# spatial residuals
ggplot(samples, aes(x = X, y = Y, color = resids)) + 
  scale_color_gradient2() +
  geom_point() + 
  coord_fixed() + 
  facet_wrap(~year)

ggplot(samples.no.s, aes(x = X, y = Y, color = resids)) + 
  scale_color_gradient2() +
  geom_point() + 
  coord_fixed() + 
  facet_wrap(~year) 

# visualize predictions, then how fixed and random effects contribute
plot_map = function(sim.dat, column) { ggplot(sim.dat, aes(x = X, y = Y, fill = {{ column }})) + 
    geom_raster() + facet_wrap(~year) + coord_fixed() }

# predict
predictions = predict(model.fit, newdata = grid.dat, return_tmb_object = TRUE)
predictions.no.s = predict(model.fit.no.s, newdata = grid.dat, return_tmb_object = TRUE)

# full prediciton
plot_map(predictions$data, exp(est)) + 
  scale_fill_viridis_c(trans = "sqrt") + 
  ggtitle("Prediction (fixed effects + all random effects)")

plot_map(predictions.no.s$data, exp(est)) + 
  scale_fill_viridis_c(trans = "sqrt") + 
  ggtitle("Prediction (fixed effects + all random effects)")

# fixed effects
plot_map(predictions$data, exp(est_non_rf)) + 
  ggtitle("Prediction (fixed effects only)") + 
  scale_fill_viridis_c(trans = "sqrt")

plot_map(predictions.no.s$data, exp(est_non_rf)) + 
  ggtitle("Prediction (fixed effects only)") + 
  scale_fill_viridis_c(trans = "sqrt")

# spatial random only
plot_map(predictions$data, omega_s) + 
  ggtitle("Spatial random effects only") + 
  scale_fill_gradient2()

plot_map(predictions.no.s$data, omega_s) + 
  ggtitle("Spatial random effects only") + 
  scale_fill_gradient2()

# oh is my mesh not big enough ?
# increased mesh but still having corners cut off

# spatiaotemporal random only
plot_map(predictions$data, epsilon_st) + 
  ggtitle("Spatiotemporal random effects only") + 
  scale_fill_gradient2()

plot_map(predictions.no.s$data, epsilon_st) + 
  ggtitle("Spatiotemporal random effects only") + 
  scale_fill_gradient2()

# we will assume that the area of each grid cell is 1
index_mb = get_index(predictions, area = 1, bias_correct = TRUE, level = 0.95) %>% 
  mutate(sample = "all")
index_mb.no.s = get_index(predictions.no.s, area = 1, bias_correct = TRUE, level = 0.95) %>%
  mutate(sample = "no shallow")
index_mb.alld = rbind(index_mb, index_mb.no.s)
# desired confidence interval here is 95%

# plot index with 95% CI
ggplot() + 
  geom_line(data = index_mb.alld, aes(x = year, y = est, color = sample, fill = sample), size = 2) + 
  geom_ribbon(data = index_mb.alld, aes(x = year, y = est, color = sample, fill = sample, ymin = lwr, ymax = upr),
              alpha = 0.25) + 
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year")
## Warning in geom_line(data = index_mb.alld, aes(x = year, y = est, color =
## sample, : Ignoring unknown aesthetics: fill

Compare the two model ests.

head(index_db.alld)
## # A tibble: 6 × 7
##    year  index variance    se    lwr    upr sample
##   <int>  <dbl>    <dbl> <dbl>  <dbl>  <dbl> <chr> 
## 1     1  9765.  148473.  385.  9000. 10529. all   
## 2     2  9293.  170693.  413.  8473. 10113. all   
## 3     3 11684.  303895.  551. 10590. 12778. all   
## 4     4 13346.  319999.  566. 12224. 14469. all   
## 5     5 12423.  184141.  429. 11572. 13275. all   
## 6     6 12564.  314422.  561. 11451. 13677. all
head(index_mb.alld)
##   year       est       lwr      upr  log_est         se se_natural  type sample
## 1    1  9660.373  8923.775 10457.77 9.175787 0.04046664   9583.573 index    all
## 2    2  9392.298  8679.061 10164.15 9.147645 0.04029490   9312.828 index    all
## 3    3 11789.339 10923.325 12724.01 9.374951 0.03892682  11694.318 index    all
## 4    4 13466.033 12457.756 14555.92 9.507926 0.03970840  13354.734 index    all
## 5    5 12715.550 11775.592 13730.54 9.450581 0.03918271  12611.376 index    all
## 6    6 12150.301 11248.871 13123.97 9.405109 0.03933040  12051.655 index    all
index_db.alld.b = index_db.alld %>% select(year, sample, index, lwr, upr) %>%
  mutate(type = "Design")
index_mb.alld.b = index_mb.alld %>% select(year, sample, est, lwr, upr) %>%
  rename(index = est) %>%
  mutate(type = "Model")
index.alld = rbind(index_db.alld.b, index_mb.alld.b)
head(index.alld)
## # A tibble: 6 × 6
##    year sample  index    lwr    upr type  
##   <int> <chr>   <dbl>  <dbl>  <dbl> <chr> 
## 1     1 all     9765.  9000. 10529. Design
## 2     2 all     9293.  8473. 10113. Design
## 3     3 all    11684. 10590. 12778. Design
## 4     4 all    13346. 12224. 14469. Design
## 5     5 all    12423. 11572. 13275. Design
## 6     6 all    12564. 11451. 13677. Design
# plot both together
ggplot() +
  geom_line(data = index.alld, aes(x = year, y = index, color = type, fill = type)) +
  geom_ribbon(data = index.alld, aes(x = year, y = index, color = type, fill = type, ymin = lwr, ymax = upr), 
              alpha = 0.3) +
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year") +
  facet_grid(.~sample)
## Warning in geom_line(data = index.alld, aes(x = year, y = index, color = type,
## : Ignoring unknown aesthetics: fill

Similar assessment between both of the models which is they do better picking up the downward turn towards later part of time series and do not do well in the earlier part of time series (well overestimated).

Exercies 3: Test out stratified sampling.

From year 1, we think we need to split up top into 2 strata (3 strata total)

Test grid for 3 strata

head(sim.dat)
##   year X Y      eta  observed eta_scaled observed_scaled depth_scaled
## 1    1 1 1 17.04969 26.029819  0.7428023       0.7252767    -1.639687
## 2    1 2 1 17.37308 30.805359  0.7568914       0.8583390    -1.572175
## 3    1 3 1 17.69647 22.991022  0.7709806       0.6406058    -1.504663
## 4    1 4 1 18.01986  9.307899  0.7850697       0.2593488    -1.437152
## 5    1 5 1 18.34325 24.644069  0.7991588       0.6866651    -1.369640
## 6    1 6 1 18.66665 24.320600  0.8132480       0.6776522    -1.302129
# based off Year 1, more obs in Y50-100

test = sim.dat %>% 
  mutate(Stratum = ifelse(X <= 49 & Y > 49, "TL", 
                          ifelse(X > 49 & Y > 49, "TR", "B")))

ggplot(subset(test, year == 1), aes(x = X, y = Y)) +
  geom_raster(aes(fill = Stratum)) +
  #scale_fill_viridis() +  
  scale_x_continuous(expand = c(0, 0)) + # Remove expansion on x-axis
  scale_y_continuous(expand = c(0, 0)) 

# ok it works! 

sim.dat.strata = sim.dat %>% 
  mutate(Stratum = ifelse(X <= 49 & Y > 49, "TL", 
                          ifelse(X > 49 & Y > 49, "TR", "B")))
head(sim.dat.strata)
##   year X Y      eta  observed eta_scaled observed_scaled depth_scaled Stratum
## 1    1 1 1 17.04969 26.029819  0.7428023       0.7252767    -1.639687       B
## 2    1 2 1 17.37308 30.805359  0.7568914       0.8583390    -1.572175       B
## 3    1 3 1 17.69647 22.991022  0.7709806       0.6406058    -1.504663       B
## 4    1 4 1 18.01986  9.307899  0.7850697       0.2593488    -1.437152       B
## 5    1 5 1 18.34325 24.644069  0.7991588       0.6866651    -1.369640       B
## 6    1 6 1 18.66665 24.320600  0.8132480       0.6776522    -1.302129       B

Set up sampling units

Need different sampling “n” because not 100 for each stratum.

# sample from each year of survey data
set.seed(245)
sample.dat = sim.dat.strata %>% group_by(year) %>% slice_sample(n = 100)
sample.dat.TL = sim.dat.strata %>% filter(Stratum == "TL") %>% 
  group_by(year) %>% slice_sample(n = 25)
sample.dat.TR = sim.dat.strata %>% filter(Stratum == "TR") %>% 
  group_by(year) %>% slice_sample(n = 25)
sample.dat.B = sim.dat.strata %>% filter(Stratum == "B") %>% 
  group_by(year) %>% slice_sample(n = 50)

samples = sampleGrid.fn(as.data.frame(sample.dat), obsCV, catchabilityPars, varName="eta_scaled")
samples.TL = sampleGrid.fn(as.data.frame(sample.dat.TL), obsCV, catchabilityPars, varName="eta_scaled")
samples.TR = sampleGrid.fn(as.data.frame(sample.dat.TR), obsCV, catchabilityPars, varName="eta_scaled")
samples.B = sampleGrid.fn(as.data.frame(sample.dat.B), obsCV, catchabilityPars, varName="eta_scaled")

# plot samples over surface of mean without observation error (taken as "true")
# all
ggplot(sim.dat.strata, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

# TL
ggplot(sim.dat.strata, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples.TL, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

# TR
ggplot(sim.dat.strata, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples.TR, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

# B
ggplot(sim.dat.strata, aes(x = X, y = Y)) +
  geom_raster(aes(fill = eta_scaled)) +
  geom_point(data = samples.B, aes(size = observation), pch = 21) +
  scale_fill_viridis() + 
  scale_size_area() +
  coord_cartesian(expand = FALSE)   +
  facet_wrap(~year)

Sampling occuring in respective strata…if equal sample size for each strata and for whole grid…do we get same answer?

Analysis - Design-based

N = 100*100
n = 100
nT = 25
nB = 50

index_db = samples %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(n/N)) * (var(observation)/n)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = n - 1) * se,
         upr = index + qt(0.975, df = n - 1) * se,
         sample = "all")

index_db.TL = samples.TL %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(nT/N)) * (var(observation)/nT)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = nT - 1) * se,
         upr = index + qt(0.975, df = nT - 1) * se,
         sample = "Top Left")

index_db.TR = samples.TR %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(nT/N)) * (var(observation)/nT)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = nT - 1) * se,
         upr = index + qt(0.975, df = nT - 1) * se,
         sample = "Top Right")

index_db.B = samples.B %>%
  group_by(year) %>%
  summarize(index = mean(observation) * N, 
            variance = N^2 * (1-(nB/N)) * (var(observation)/nB)) %>%
  mutate(se = sqrt(variance),
         lwr = index - qt(0.975, df = nB - 1) * se,
         upr = index + qt(0.975, df = nB - 1) * se,
         sample = "Bottom")

index_db.strata = rbind(index_db.TL, index_db.TR, index_db.B) %>%
  group_by(year) %>%
  summarize(index = mean(index),
            variance = mean(variance),
            se = mean(se), 
            lwr = mean(lwr), 
            upr = mean(upr)) %>%
  mutate(sample = "Strata")

index_db.allstr = rbind(index_db, index_db.strata)

# plot index with 95% CI
ggplot() + 
  geom_line(data = index_db.allstr, aes(x = year, y = index, color = sample, fill = sample), size = 2) + 
  geom_ribbon(data = index_db.allstr, aes(x = year, y = index, color = sample, fill = sample, ymin = lwr, ymax = upr),
              alpha = 0.25) + 
  geom_point(data = index_real, aes(x = year, y = index.eta), size = 3, color = "black") +
  labs(y = "Total biomass estimate", 
       x = "Year")
## Warning in geom_line(data = index_db.allstr, aes(x = year, y = index, color =
## sample, : Ignoring unknown aesthetics: fill

STOPPING HERE—I DON’T UNDERSTAND HOW TO GET ABUDNANCE FROM STRATA ********